## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## [conflicted] Will prefer dplyr::filter over any other package.
## [conflicted] Will prefer dplyr::select over any other package.
## [conflicted] Will prefer dplyr::slice over any other package.
## [conflicted] Will prefer dplyr::rename over any other package.
## [conflicted] Will prefer dplyr::intersect over any other package.
## Rows: 1560 Columns: 48
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (27): chr, strand, circ_id, circ_id_strand, tool, cell_line, FWD_primer,...
## dbl (21): start, end, FWD_pos, FWD_length, REV_pos, REV_length, FWD_TM, REV_...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1137099 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (16): chr, strand, cell_line, tool, circ_id, circ_id_strand, count_group...
## dbl (7): start, end, BSJ_count, n_detected, n_db, estim_len_in, BSJ_count_m...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 29 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): tool, count_group
## dbl (16): nr_qPCR_total, nr_qPCR_fail, nr_qPCR_val, nr_RR_total, nr_RR_fail,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
make count table
# 2x2 contingency table: database-annotated (in_db) vs novel circRNAs
# against the compound validation outcome (fail/pass), high-abundance only.
# NOTE(review): unlike the analogous tables below there is no unique() after
# select() here, so duplicated rows would be counted twice — confirm cq has
# one row per circ_id/cell_line for these columns.
cont_table = cq %>%
select(circ_id, cell_line, compound_val, count_group_median, n_db) %>%
# only keep high abundant circRNAs
filter(count_group_median == 'count ≥ 5') %>%
# only keep circRNAs that have a compound validation outcome
filter(!is.na(compound_val)) %>%
# binarise: 0 = in no database (novel), 1 = in at least one database
mutate(n_db = ifelse(is.na(n_db), 0, 1)) %>%
group_by(n_db) %>%
count(compound_val) %>%
pivot_wider(values_from = n, names_from = compound_val) %>%
ungroup() %>%
select(-n_db)
# rows come out in n_db group order (0 then 1) -> label accordingly
cont_table = as.data.frame(cont_table)
rownames(cont_table) <- c("novel", "in_db")
cont_table
plot data
if chisquare expected < 5 => then Fisher should be used
## fail pass
## novel 9.871795 67.12821
## in_db 110.128205 748.87179
chisquare test
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: cont_table
## X-squared = 337.48, df = 1, p-value < 2.2e-16
OR
## [1] 57.08276
make count table
# 2x2 table: circRNAs found by a single tool vs by several tools,
# against the compound validation outcome (high-abundance only)
cont_table = cq %>%
  select(circ_id, cell_line, compound_val, count_group_median, n_detected) %>%
  unique() %>%
  # high-abundance circRNAs with a validation outcome only
  filter(count_group_median == 'count ≥ 5', !is.na(compound_val)) %>%
  # 0 = detected by exactly one tool, 1 = detected by several tools
  mutate(n_detected = ifelse(n_detected == 1, 0, 1)) %>%
  count(n_detected, compound_val) %>%
  pivot_wider(names_from = compound_val, values_from = n) %>%
  select(-n_detected)
cont_table = as.data.frame(cont_table)
# rows follow n_detected order (0 then 1)
rownames(cont_table) <- c("unique", "multiple_tools")
cont_table
plot data
if chisquare expected < 5 => then Fisher should be used
## fail pass
## unique 10.4763 67.5237
## multiple_tools 108.5237 699.4763
chisquare test
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: cont_table
## X-squared = 327.23, df = 1, p-value < 2.2e-16
OR
## [1] 56.4
numbers for paper
# circRNAs detected by more than one tool that failed at least one
# validation assay (qPCR, RNase R, or amplicon sequencing)
cq %>%
  filter(tools > 1,
         qPCR_val == "fail" | RR_val == 'fail' | amp_seq_val == 'fail') %>%
  select(circ_id) %>%
  unique()
make count table
# contingency table: single-exon vs multi-exon circRNAs against the
# compound validation outcome, high-abundance circRNAs only
cont_table = cq %>%
select(circ_id, cell_line, compound_val, count_group_median, nr_exons) %>%
unique() %>%
filter(count_group_median == 'count ≥ 5') %>%
# filter out circRNAs without a validation outcome
filter(!is.na(compound_val)) %>%
# also remove the ones that do not have an exon annotation
filter(!is.na(nr_exons), !nr_exons == "ambiguous") %>%
# binarise: 0 = single exon, 1 = more than one exon
mutate(exon_bin = ifelse(nr_exons == 1, 0, 1)) %>%
group_by(exon_bin) %>%
count(compound_val) %>%
pivot_wider(values_from = n, names_from = compound_val) %>%
ungroup() %>%
# a group with no failures yields NA after pivoting -> treat as 0
mutate(fail = ifelse(is.na(fail), 0, fail)) %>%
select(-exon_bin)
cont_table = as.data.frame(cont_table)
# rows follow exon_bin order (0 then 1)
rownames(cont_table) <- c("single_exon", "multi_exons")
cont_table
plot data
if chisquare expected < 5 => then Fisher should be used
## fail pass
## single_exon 9.799439 127.2006
## multi_exons 41.200561 534.7994
chisquare test
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: cont_table
## X-squared = 15.578, df = 1, p-value = 7.917e-05
OR
## [1] 3.294828
# extract the start-exon label from the fixed-width start_match string,
# then strip a leading '_' left over when the exon number is short
cq_start = cq %>%
  mutate(start_exon_nr = substr(start_match, 19, 27)) %>%
  mutate(start_exon_nr = ifelse(substr(start_exon_nr, 1, 1) == '_',
                                substr(start_exon_nr, 2, 10),
                                start_exon_nr))
cq_start
# start-exon position (exon 1 vs later) against the validation outcome
cont_table = cq_start %>%
  filter(count_group_median == 'count ≥ 5') %>%
  select(circ_id, cell_line, compound_val, start_exon_nr) %>%
  unique() %>%
  # drop circRNAs without a start-exon label or a validation outcome
  filter(!is.na(start_exon_nr), !is.na(compound_val)) %>%
  # 1 = back-splice starts in exon 1
  mutate(exon_1 = ifelse(start_exon_nr == "exon_1", 1, 0)) %>%
  count(exon_1, compound_val) %>%
  pivot_wider(names_from = compound_val, values_from = n)
cont_table
=> not enough values
# contingency table: canonical (AGNGT) vs non-canonical splice-site motif
# against the validation outcome; circRNAs with unknown strand are excluded
# because the motif cannot be determined for them
cont_table = cq %>%
filter(count_group_median == 'count ≥ 5',
!strand == 'unknown') %>%
select(circ_id, cell_line, compound_val, ss_motif) %>%
unique() %>%
# 1 = canonical AGNGT splice-site motif, 0 = anything else
mutate(ss_can = ifelse(ss_motif == "AGNGT", 1, 0)) %>%
# filter out circRNAs without a validation outcome
filter(!is.na(compound_val)) %>%
# tabulate validation outcome per splice-site class
group_by(ss_can) %>%
count(compound_val) %>%
pivot_wider(values_from = n, names_from = compound_val) %>%
ungroup() %>%
select(-ss_can)
cont_table = as.data.frame(cont_table)
# rows follow ss_can order (0 then 1); NOTE(review): "cannonical" is a
# typo for "canonical" — kept as-is since downstream output uses it
rownames(cont_table) <- c("non_cannonical", "cannonical")
cont_table
plot data
if chisquare expected < 5 => then Fisher should be used
## Warning in chisq.test(cont_table): Chi-squared approximation may be incorrect
## fail pass
## non_cannonical 4.090909 27.90909
## cannonical 85.909091 586.09091
chisquare test
## Warning in chisq.test(cont_table): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: cont_table
## X-squared = 134.58, df = 1, p-value < 2.2e-16
OR
## [1] 41.16667
# annotated-transcript (ENST) match vs no match against the validation
# outcome, high-abundance circRNAs only
cont_table = cq %>%
  filter(count_group_median == "count ≥ 5") %>%
  select(circ_id, cell_line, compound_val, ENST) %>%
  unique() %>%
  # keep circRNAs with a validation outcome
  filter(!is.na(compound_val)) %>%
  # binary: does the circRNA match an annotated transcript?
  mutate(ENST_group = ifelse(is.na(ENST), 'no_match', "match")) %>%
  count(ENST_group, compound_val) %>%
  pivot_wider(names_from = compound_val, values_from = n) %>%
  select(-ENST_group)
cont_table = as.data.frame(cont_table)
# groups come out sorted: "match" before "no_match"
rownames(cont_table) <- c("match", "no_match")
cont_table
plot data
if chisquare expected < 5 => then Fisher should be used
## fail pass
## match 106.79143 689.20857
## no_match 12.20857 78.79143
chisquare test
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: cont_table
## X-squared = 179.73, df = 1, p-value < 2.2e-16
OR
## [1] 16.41331
# detection approach (candidate-based vs segmented read-based) against the
# validation outcome; integrative tools are excluded
cont_table = cq %>%
  filter(count_group_median == "count ≥ 5") %>%
  left_join(read_tsv('../data/details/tool_annotation.txt')) %>%
  select(circ_id, cell_line, compound_val, approach) %>%
  filter(approach != 'integrative') %>%
  unique() %>%
  # keep circRNAs with a validation outcome
  filter(!is.na(compound_val)) %>%
  count(approach, compound_val) %>%
  pivot_wider(names_from = compound_val, values_from = n) %>%
  select(-approach)
## Rows: 16 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): tool, approach, tool_lt, lin_annotation, strand_anno, splicing, BSJ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining with `by = join_by(tool)`
# label rows; assumes the pivoted table rows come out ordered
# candidate-based first, then segmented read-based
cont_table = as.data.frame(cont_table)
rownames(cont_table) <- c("candidate-based", "segmented read-based")
cont_table
plot data
if chisquare expected < 5 => then Fisher should be used
## fail pass
## candidate-based 27.67367 165.3263
## segmented read-based 88.32633 527.6737
chisquare test
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: cont_table
## X-squared = 8.2103, df = 1, p-value = 0.004165
OR
## [1] 2.327249
# contingency table: low-abundance vs high-abundance circRNAs against the
# compound validation outcome
cont_table = cq %>%
select(circ_id, cell_line, compound_val, count_group_median) %>%
unique() %>%
# filter out circRNAs without a validation outcome
filter(!is.na(compound_val)) %>%
group_by(count_group_median) %>%
count(compound_val) %>%
pivot_wider(values_from = n, names_from = compound_val) %>%
ungroup() %>%
select(-count_group_median)
cont_table = as.data.frame(cont_table)
# rows follow count_group_median group order
rownames(cont_table) <- c("count < 5", "count ≥ 5")
cont_table
plot data
if chisquare expected < 5 => then Fisher should be used
## fail pass
## count < 5 53.64444 230.3556
## count ≥ 5 167.35556 718.6444
chisquare test
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: cont_table
## X-squared = 69.504, df = 1, p-value < 2.2e-16
OR
## [1] 3.612245
## Rows: 16 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): tool, approach, tool_lt, lin_annotation, strand_anno, splicing, BSJ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 32 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): tool, count_group_median
## dbl (6): nr_detected, nr_expected, sensitivity, perc_compound_val, total_n, ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
add annotation to sensitivity
# attach the tool annotation to per-tool sensitivities and keep only the
# high-abundance count group
sens_anno = sens %>%
  left_join(tool_anno) %>%
  select(-tool_lt) %>%
  ungroup() %>%
  filter(count_group_median == 'count ≥ 5')
## Joining with `by = join_by(tool)`
##
## Wilcoxon rank sum exact test
##
## data: sensitivity by approach
## W = 11, p-value = 0.456
## alternative hypothesis: true location shift is not equal to 0
##
## Kruskal-Wallis rank sum test
##
## data: sensitivity by approach
## Kruskal-Wallis chi-squared = 0.63636, df = 2, p-value = 0.7275
##
## Wilcoxon rank sum exact test
##
## data: sensitivity by lin_annotation
## W = 19, p-value = 1
## alternative hypothesis: true location shift is not equal to 0
# Kruskal-Wallis of sensitivity across strand-annotation categories,
# dropping tools that report no strand
kruskal.test(sensitivity ~ strand_anno,
             data = sens_anno %>% filter(strand_anno != "no strand reported"))
##
## Kruskal-Wallis rank sum test
##
## data: sensitivity by strand_anno
## Kruskal-Wallis chi-squared = 3.4962, df = 3, p-value = 0.3213
##
## Wilcoxon rank sum exact test
##
## data: sensitivity by splicing
## W = 28, p-value = 0.01667
## alternative hypothesis: true location shift is not equal to 0
## [1] 16
## [1] -2.39398
## [1] 0.5984949
# Pairwise sensitivity differences: every canonical-splicing tool paired
# with every non-canonical-splicing tool.
# FIX: the original grew the tibble with bind_rows() inside the nested loop
# (quadratic copying); collect one-row tibbles in a list and bind once.
pairs = list()
for (tool_can in sens_anno %>% filter(splicing == "canonical") %>% pull(tool)) {
  sens_can = sens_anno %>% filter(tool == tool_can) %>% pull(sensitivity)
  for (tool_non_can in sens_anno %>% filter(splicing == "non-canonical") %>% pull(tool)) {
    sens_non_can = sens_anno %>% filter(tool == tool_non_can) %>% pull(sensitivity)
    pairs[[length(pairs) + 1]] = tibble(tool_can, sens_can, tool_non_can, sens_non_can)
  }
}
median_diff = bind_rows(pairs) %>%
  mutate(sens_diff = sens_can - sens_non_can)
median_diff
## [1] 0.5026076
##
## Wilcoxon rank sum exact test
##
## data: sensitivity by BSJ_filter
## W = 27, p-value = 1
## alternative hypothesis: true location shift is not equal to 0
below 5
# Spearman correlation between observed and extrapolated sensitivity in the
# low-abundance group; Sailfish-cir is excluded
sens_tmp = sens %>%
  filter(count_group_median == "count < 5", tool != "Sailfish-cir")
cor.test(sens_tmp$sensitivity, sens_tmp$extrapolated_sensitivity, method = 'spearman')
## Warning in cor.test.default(sens_tmp$sensitivity,
## sens_tmp$extrapolated_sensitivity, : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: sens_tmp$sensitivity and sens_tmp$extrapolated_sensitivity
## S = 62.17, p-value = 0.0004567
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.8292042
above 5
# same correlation for the high-abundance group; Sailfish-cir excluded
sens_tmp = sens %>%
  filter(count_group_median == "count ≥ 5", tool != "Sailfish-cir")
cor.test(sens_tmp$sensitivity, sens_tmp$extrapolated_sensitivity, method = 'spearman')
##
## Spearman's rank correlation rho
##
## data: sens_tmp$sensitivity and sens_tmp$extrapolated_sensitivity
## S = 124, p-value = 0.000988
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.7785714
make count table
# contingency table: validation outcome per cell line, high-abundance only
cont_table = cq %>%
select(circ_id, cell_line, compound_val, count_group_median) %>%
unique() %>%
# only keep high abundant circRNAs
filter(count_group_median == 'count ≥ 5') %>%
# only keep circRNAs with a compound validation outcome
filter(!is.na(compound_val)) %>%
# tabulate validation outcome per cell line
group_by(cell_line) %>%
count(compound_val) %>%
pivot_wider(values_from = n, names_from = compound_val) %>%
ungroup() %>%
select(-cell_line)
cont_table = as.data.frame(cont_table)
# rows follow cell_line group order
rownames(cont_table) <- c("HLF", "NCI-H23", "SW480")
cont_table
plot data
if chisquare expected < 5 => then Fisher should be used
## fail pass
## HLF 47.14334 303.8567
## NCI-H23 38.01016 244.9898
## SW480 33.84650 218.1535
chisquare test
##
## Pearson's Chi-squared test
##
## data: cont_table
## X-squared = 23.691, df = 2, p-value = 7.171e-06
first calculate the total nr of validated circ per count group
# number of validated circRNAs per count group and cell line
nr_val_cl = cq %>%
  filter(compound_val == 'pass') %>%
  # one row per validated circRNA per cell line
  select(circ_id, cell_line, count_group_median) %>%
  unique() %>%
  group_by(count_group_median, cell_line) %>%
  summarise(nr_expected = n())
## `summarise()` has grouped output by 'count_group_median'. You can override
## using the `.groups` argument.
then calculate sensitivity by dividing nr of circ found by total
# per-tool sensitivity by count group and cell line:
# circRNAs the tool detected / all validated circRNAs
detected_by_tool = all_circ %>%
  select(tool, circ_id, cell_line) %>%
  unique()
sens_cl = cq %>%
  # the set of uniquely validated circRNAs
  filter(compound_val == 'pass') %>%
  select(circ_id, cell_line, count_group_median) %>%
  unique() %>%
  # which tools detected each of them
  left_join(detected_by_tool) %>%
  group_by(tool, count_group_median, cell_line) %>%
  summarise(nr_detected = n()) %>%
  left_join(nr_val_cl) %>%
  mutate(sensitivity = nr_detected / nr_expected) %>%
  ungroup()
## Joining with `by = join_by(circ_id, cell_line)`
## `summarise()` has grouped output by 'tool', 'count_group_median'. You can
## override using the `.groups` argument.
## Joining with `by = join_by(count_group_median, cell_line)`
then ANOVA
# two-way ANOVA of sensitivity: cell line + tool (high-abundance group)
sens_cl_anova = sens_cl %>%
  filter(count_group_median == 'count ≥ 5') %>%
  select(tool, cell_line, sensitivity)
sens_cl_anova
# fix factor level order so the ANOVA terms are reported consistently
sens_cl_anova = sens_cl_anova %>%
  mutate(cell_line = factor(cell_line, levels = c('HLF', 'NCI-H23', 'SW480')),
         tool = factor(tool, levels = c("CIRCexplorer3", "CirComPara2",
                                        "circRNA_finder", "circseq_cup",
                                        "CircSplice", "circtools", "CIRI2",
                                        "CIRIquant", "ecircscreen", "find_circ",
                                        "KNIFE", "NCLcomparator", "NCLscan",
                                        "PFv2", "Sailfish-cir", "segemehl")))
res.aov = aov(sensitivity ~ cell_line + tool, data = sens_cl_anova)
summary(res.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## cell_line 2 0.0122 0.00609 4.048 0.0278 *
## tool 15 1.6454 0.10969 72.866 <2e-16 ***
## Residuals 30 0.0452 0.00151
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] 0.00194091
## [1] 0.9750377
recalculate the precision per cell line
# Per-tool precision per cell line and count group, for each validation
# assay (qPCR, RNase R, amplicon sequencing) and the compound outcome.
val_cl = cq %>%
  select(tool, circ_id, cell_line, count_group, qPCR_val, RR_val, RR_val_detail, amp_seq_val, amp_seq_val_detail, compound_val) %>%
  group_by(tool, count_group, cell_line) %>%
  summarise(nr_qPCR_total = n(),
            nr_qPCR_fail = sum(qPCR_val == 'fail'),
            nr_qPCR_val = sum(qPCR_val == 'pass'),
            nr_RR_total = n() - sum(is.na(RR_val)), # NA RR_val = 'out_of_range'
            nr_RR_fail = sum(RR_val == "fail", na.rm = TRUE),
            nr_RR_val = sum(RR_val == "pass", na.rm = TRUE),
            nr_amp_total = n() - sum(is.na(amp_seq_val)), # NA amp_seq_val = 'not_included'
            nr_amp_fail = sum(amp_seq_val == "fail", na.rm = TRUE),
            nr_amp_val = sum(amp_seq_val == "pass", na.rm = TRUE),
            # FIX: the compound total was computed from is.na(amp_seq_val)
            # (copy-paste from the line above); count non-NA compound_val
            # instead so perc_compound_val has the correct denominator.
            # (Equivalent only if compound_val is NA exactly when amp_seq_val
            # is NA — which the earlier filters suggest is not guaranteed.)
            nr_compound_total = n() - sum(is.na(compound_val)),
            nr_compound_fail = sum(compound_val == "fail", na.rm = TRUE),
            nr_compound_val = sum(compound_val == "pass", na.rm = TRUE)) %>%
  mutate(perc_qPCR_val = nr_qPCR_val/nr_qPCR_total,
         perc_RR_val = nr_RR_val/nr_RR_total,
         perc_amp_val = nr_amp_val/nr_amp_total,
         perc_compound_val = nr_compound_val/nr_compound_total) %>%
  ungroup()
## `summarise()` has grouped output by 'tool', 'count_group'. You can override
## using the `.groups` argument.
ANOVA
# precision per assay, high-abundance group, prepared for cell line + tool ANOVA
val_cl_anova = val_cl %>%
  filter(count_group == 'count ≥ 5') %>%
  select(tool, cell_line, perc_qPCR_val, perc_RR_val, perc_amp_val, perc_compound_val)
val_cl_anova
# fix factor level order for the ANOVA terms
val_cl_anova = val_cl_anova %>%
  mutate(cell_line = factor(cell_line, levels = c('HLF', 'NCI-H23', 'SW480')),
         tool = factor(tool, levels = c("CIRCexplorer3", "CirComPara2",
                                        "circRNA_finder", "circseq_cup",
                                        "CircSplice", "circtools", "CIRI2",
                                        "CIRIquant", "ecircscreen", "find_circ",
                                        "KNIFE", "NCLcomparator", "NCLscan",
                                        "PFv2", "Sailfish-cir", "segemehl")))
## Df Sum Sq Mean Sq F value Pr(>F)
## cell_line 2 0.008126 0.004063 6.586 0.00453 **
## tool 14 0.024932 0.001781 2.887 0.00821 **
## Residuals 28 0.017273 0.000617
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] 0.1614512
## [1] 0.4953607
## Df Sum Sq Mean Sq F value Pr(>F)
## cell_line 2 0.05260 0.026300 8.642 0.00119 **
## tool 14 0.16010 0.011436 3.758 0.00140 **
## Residuals 28 0.08521 0.003043
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] 0.6449398
## [1] 0.2804385
## Df Sum Sq Mean Sq F value Pr(>F)
## cell_line 2 0.0187 0.00935 3.375 0.0486 *
## tool 14 1.2629 0.09020 32.555 1.32e-13 ***
## Residuals 28 0.0776 0.00277
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] 0.09136213
## [1] 0.8813758
## Df Sum Sq Mean Sq F value Pr(>F)
## cell_line 2 0.1250 0.06252 17.73 1.06e-05 ***
## tool 14 1.3220 0.09443 26.77 1.59e-12 ***
## Residuals 28 0.0988 0.00353
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] 0.3895813
## [1] 0.5884222
## Rows: 675 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): tool_1, tool_2, cell_line, count_group, tool_lt_1, tool_lt_2, tool_...
## dbl (9): nr_union, nr_intersection, perc_compound_val_1, perc_compound_val_2...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 16 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): tool, approach, tool_lt, lin_annotation, strand_anno, splicing, BSJ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Does combining two tools of the same vs a different type yield a larger
# percentage increase in detected circRNAs? Repeated for four annotations.

# 1) detection approach
union_test = simple_union %>%
  filter(count_group == "count ≥ 5", tool_1 != tool_2) %>%
  left_join(tool_annotation %>%
              select(tool, approach) %>%
              rename(tool_1 = tool, approach_1 = approach)) %>%
  left_join(tool_annotation %>%
              select(tool, approach) %>%
              rename(tool_2 = tool, approach_2 = approach)) %>%
  mutate(approach = ifelse(approach_1 == approach_2, 'same', 'different')) %>%
  select(tool_1, tool_2, cell_line, perc_increase_t1, approach)
## Joining with `by = join_by(tool_1)`
## Joining with `by = join_by(tool_2)`
union_test %>%
  group_by(approach) %>%
  summarise(median_increase_perc = median(perc_increase_t1),
            mean_increase_perc = mean(perc_increase_t1))

# 2) linear annotation use
union_test = simple_union %>%
  filter(count_group == "count ≥ 5", tool_1 != tool_2) %>%
  left_join(tool_annotation %>%
              select(tool, lin_annotation) %>%
              rename(tool_1 = tool, approach_1 = lin_annotation)) %>%
  left_join(tool_annotation %>%
              select(tool, lin_annotation) %>%
              rename(tool_2 = tool, approach_2 = lin_annotation)) %>%
  mutate(approach = ifelse(approach_1 == approach_2, 'same', 'different')) %>%
  select(tool_1, tool_2, cell_line, perc_increase_t1, approach)
## Joining with `by = join_by(tool_1)`
## Joining with `by = join_by(tool_2)`
union_test %>%
  group_by(approach) %>%
  summarise(median_increase_perc = median(perc_increase_t1),
            mean_increase_perc = mean(perc_increase_t1))

# 3) strand annotation; tools that report no strand are dropped
union_test = simple_union %>%
  filter(count_group == "count ≥ 5", tool_1 != tool_2) %>%
  left_join(tool_annotation %>%
              select(tool, strand_anno) %>%
              rename(tool_1 = tool, approach_1 = strand_anno)) %>%
  left_join(tool_annotation %>%
              select(tool, strand_anno) %>%
              rename(tool_2 = tool, approach_2 = strand_anno)) %>%
  filter(approach_1 != "no strand reported",
         approach_2 != 'no strand reported') %>%
  mutate(approach = ifelse(approach_1 == approach_2, 'same', 'different')) %>%
  select(tool_1, tool_2, cell_line, perc_increase_t1, approach)
## Joining with `by = join_by(tool_1)`
## Joining with `by = join_by(tool_2)`
union_test %>%
  group_by(approach) %>%
  summarise(median_increase_perc = median(perc_increase_t1),
            mean_increase_perc = mean(perc_increase_t1))

# 4) splicing (canonical vs non-canonical)
union_test = simple_union %>%
  filter(count_group == "count ≥ 5", tool_1 != tool_2) %>%
  left_join(tool_annotation %>%
              select(tool, splicing) %>%
              rename(tool_1 = tool, approach_1 = splicing)) %>%
  left_join(tool_annotation %>%
              select(tool, splicing) %>%
              rename(tool_2 = tool, approach_2 = splicing)) %>%
  mutate(approach = ifelse(approach_1 == approach_2, 'same', 'different')) %>%
  select(tool_1, tool_2, cell_line, perc_increase_t1, approach)
## Joining with `by = join_by(tool_1)`
## Joining with `by = join_by(tool_2)`
union_test %>%
  group_by(approach) %>%
  summarise(median_increase_perc = median(perc_increase_t1),
            mean_increase_perc = mean(perc_increase_t1))
## Rows: 2303689 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (8): chr, strand, cell_line, tool, circ_id, circ_id_strand, count_group,...
## dbl (9): start, end, count_UT, count_T, nr_reads_UT, nr_reads_T, cpm_T, cpm_...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# RNase R enrichment (enriched vs not enriched) by median count group
cont_table = treatment %>%
  # Sailfish-cir reports no counts
  filter(tool != 'Sailfish-cir') %>%
  # drop circRNAs lacking a count in either condition
  filter(!is.na(count_UT), !is.na(count_T)) %>%
  # attach the median count group
  left_join(all_circ %>%
              select(circ_id_strand, count_group_median, cell_line) %>%
              unique()) %>%
  select(circ_id_strand, cell_line, tool, enrichment_bin, count_group_median) %>%
  count(count_group_median, enrichment_bin) %>%
  pivot_wider(names_from = enrichment_bin, values_from = n) %>%
  select(-count_group_median)
## Joining with `by = join_by(cell_line, circ_id_strand)`
cont_table = as.data.frame(cont_table)
# rows follow count_group_median group order
rownames(cont_table) <- c("count < 5", "count ≥ 5")
cont_table
## enriched not enriched
## count < 5 302867 53352.05
## count ≥ 5 119751 21094.95
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: cont_table
## X-squared = 1166.7, df = 1, p-value < 2.2e-16
OR
## [1] 1.37386